In [64]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 30,12
In [65]:
train = pd.read_csv("Train1982.csv")
test= pd.read_csv("Test.csv")
submit = pd.read_csv("Sample_Submission.csv")
In [66]:
print("Train shape: " + str(train.shape))
print("Test shape: " + str(test.shape))
train1.head()
Train shape: (18288, 3)
Test shape: (5112, 2)
Out[66]:
ID Datetime Count
0 0 25-08-2012 00:00 8
1 1 25-08-2012 01:00 2
2 2 25-08-2012 02:00 6
3 3 25-08-2012 03:00 2
4 4 25-08-2012 04:00 2
In [68]:
from math import floor
#split the train data into training set and valid set
train = train.loc[:floor(2*train.shape[0]/3)]
valid = train.loc[floor(2*train.shape[0]/3):]
train.set_index('Datetime', inplace = True)
valid.set_index('Datetime', inplace = True)
test.set_index('Datetime', inplace = True)
In [69]:
print (train.shape)
print (valid.shape)
train.head()
(8129, 2)
(2710, 2)
Out[69]:
ID Count
Datetime
25-08-2012 00:00 0 8
25-08-2012 01:00 1 2
25-08-2012 02:00 2 6
25-08-2012 03:00 3 2
25-08-2012 04:00 4 2
In [70]:
#parsing the datetime data 
dataparse = lambda dates: pd.datetime.strptime(dates, "%d-%m-%Y %H:%M")
train.index = train.index.map(dataparse)
valid.index = valid.index.map(dataparse)
test.index = test.index.map(dataparse)
train.head()
Out[70]:
ID Count
Datetime
2012-08-25 00:00:00 0 8
2012-08-25 01:00:00 1 2
2012-08-25 02:00:00 2 6
2012-08-25 03:00:00 3 2
2012-08-25 04:00:00 4 2
In [178]:
ts=train['Count']
In [179]:
ts
Out[179]:
Datetime
2012-08-25 00:00:00      8
2012-08-25 01:00:00      2
2012-08-25 02:00:00      6
2012-08-25 03:00:00      2
2012-08-25 04:00:00      2
2012-08-25 05:00:00      2
2012-08-25 06:00:00      2
2012-08-25 07:00:00      2
2012-08-25 08:00:00      6
2012-08-25 09:00:00      2
2012-08-25 10:00:00      2
2012-08-25 11:00:00      6
2012-08-25 12:00:00      4
2012-08-25 13:00:00      2
2012-08-25 14:00:00      6
2012-08-25 15:00:00      2
2012-08-25 16:00:00      2
2012-08-25 17:00:00      2
2012-08-25 18:00:00      2
2012-08-25 19:00:00      2
2012-08-25 20:00:00      2
2012-08-25 21:00:00      6
2012-08-25 22:00:00      2
2012-08-25 23:00:00      2
2012-08-26 00:00:00      4
2012-08-26 01:00:00      6
2012-08-26 02:00:00      2
2012-08-26 03:00:00      4
2012-08-26 04:00:00      2
2012-08-26 05:00:00      2
                      ... 
2013-07-28 11:00:00     58
2013-07-28 12:00:00     54
2013-07-28 13:00:00     74
2013-07-28 14:00:00     74
2013-07-28 15:00:00     48
2013-07-28 16:00:00     50
2013-07-28 17:00:00     38
2013-07-28 18:00:00     46
2013-07-28 19:00:00     40
2013-07-28 20:00:00     50
2013-07-28 21:00:00     46
2013-07-28 22:00:00     62
2013-07-28 23:00:00     56
2013-07-29 00:00:00     64
2013-07-29 01:00:00     54
2013-07-29 02:00:00     38
2013-07-29 03:00:00     28
2013-07-29 04:00:00     24
2013-07-29 05:00:00     24
2013-07-29 06:00:00     38
2013-07-29 07:00:00     38
2013-07-29 08:00:00     32
2013-07-29 09:00:00     64
2013-07-29 10:00:00    144
2013-07-29 11:00:00    138
2013-07-29 12:00:00    150
2013-07-29 13:00:00    120
2013-07-29 14:00:00    104
2013-07-29 15:00:00    156
2013-07-29 16:00:00    106
Name: Count, Length: 8129, dtype: int64
In [73]:
ts.head(10)

plt.plot(ts)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/pandas/plotting/_converter.py:129: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)
Out[73]:
[<matplotlib.lines.Line2D at 0x1c1a5d2160>]
In [ ]:
import numpy as np
In [74]:
ts_log = np.log(ts)
In [75]:
moving_avg = ts_log.rolling(24).mean()
In [79]:
moving_avg.fillna(0, inplace=True)
In [81]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    ts_log = np.log(ts)
    
    #Determing rolling statistics
    rolmean = ts_log.rolling(24).mean()
    rolstd = ts_log.rolling(24).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print (dfoutput)
In [82]:
test_stationarity(ts)
Results of Dickey-Fuller Test:
Test Statistic                -5.789901e+00
p-value                        4.899933e-07
#Lags Used                     3.700000e+01
Number of Observations Used    8.091000e+03
Critical Value (1%)           -3.431158e+00
Critical Value (5%)           -2.861897e+00
Critical Value (10%)          -2.566960e+00
dtype: float64
In [89]:
moving_avg = ts_log.rolling(24).mean()
plt.plot(ts_log)
plt.plot(moving_avg, color = 'red')
Out[89]:
[<matplotlib.lines.Line2D at 0x116ebf8d0>]
In [90]:
train_log_moving_avg_diff = ts_log - moving_avg
In [91]:
# Since we are taking the average of 24 values, rolling mean is not defined for the first 23 values. (NaN)
train_log_moving_avg_diff.dropna(inplace = True)
test_stationarity(train_log_moving_avg_diff)
Results of Dickey-Fuller Test:
Test Statistic                  -19.619613
p-value                           0.000000
#Lags Used                       37.000000
Number of Observations Used    8068.000000
Critical Value (1%)              -3.431161
Critical Value (5%)              -2.861898
Critical Value (10%)             -2.566961
dtype: float64
In [98]:
expwighted_avg = ts_log.ewm(24).mean()
plt.plot(ts_log)
plt.plot(expwighted_avg, color='red')
Out[98]:
[<matplotlib.lines.Line2D at 0x1c1fd8bba8>]
In [99]:
train_log_moving_avg_diff = ts - moving_avg
In [96]:
# Since we are taking the average of 24 values, rolling mean is not defined for the first 23 values. (NaN)
train_log_moving_avg_diff.dropna(inplace = True)
test_stationarity(train_log_moving_avg_diff)
Results of Dickey-Fuller Test:
Test Statistic                -5.900519e+00
p-value                        2.785146e-07
#Lags Used                     3.700000e+01
Number of Observations Used    8.068000e+03
Critical Value (1%)           -3.431161e+00
Critical Value (5%)           -2.861898e+00
Critical Value (10%)          -2.566961e+00
dtype: float64
In [101]:
moving_avg = ts_log.rolling(24).mean() 
plt.plot(ts_log)
plt.plot(moving_avg, color = 'red')
Out[101]:
[<matplotlib.lines.Line2D at 0x1c227cf470>]
In [102]:
train_log_moving_avg_diff = ts_log - moving_avg
In [103]:
# Since we are taking the average of 24 values, rolling mean is not defined for the first 23 values. (NaN)
train_log_moving_avg_diff.dropna(inplace = True)
test_stationarity(train_log_moving_avg_diff)
Results of Dickey-Fuller Test:
Test Statistic                  -19.619613
p-value                           0.000000
#Lags Used                       37.000000
Number of Observations Used    8068.000000
Critical Value (1%)              -3.431161
Critical Value (5%)              -2.861898
Critical Value (10%)             -2.566961
dtype: float64
In [108]:
expwighted_avg =  ts_log.ewm(halflife=24).mean() 
plt.plot(train_log)
plt.plot(expwighted_avg, color='red')
Out[108]:
[<matplotlib.lines.Line2D at 0x1c229fc198>]
In [109]:
ts_log.ewm(halflife=24).mean()
Out[109]:
Datetime
2012-08-25 00:00:00    2.079442
2012-08-25 01:00:00    1.376286
2012-08-25 02:00:00    1.518795
2012-08-25 03:00:00    1.303357
2012-08-25 04:00:00    1.174166
2012-08-25 05:00:00    1.088098
2012-08-25 06:00:00    1.026673
2012-08-25 07:00:00    0.980648
2012-08-25 08:00:00    1.081528
2012-08-25 09:00:00    1.037451
2012-08-25 10:00:00    1.001439
2012-08-25 11:00:00    1.078255
2012-08-25 12:00:00    1.106269
2012-08-25 13:00:00    1.070907
2012-08-25 14:00:00    1.129276
2012-08-25 15:00:00    1.095723
2012-08-25 16:00:00    1.066184
2012-08-25 17:00:00    1.039988
2012-08-25 18:00:00    1.016608
2012-08-25 19:00:00    0.995622
2012-08-25 20:00:00    0.976686
2012-08-25 21:00:00    1.026027
2012-08-25 22:00:00    1.006502
2012-08-25 23:00:00    0.988661
2012-08-26 00:00:00    1.010674
2012-08-26 01:00:00    1.052783
2012-08-26 02:00:00    1.033875
2012-08-26 03:00:00    1.051967
2012-08-26 04:00:00    1.033959
2012-08-26 05:00:00    1.017218
                         ...   
2013-07-28 11:00:00    3.798994
2013-07-28 12:00:00    3.804403
2013-07-28 13:00:00    3.818627
2013-07-28 14:00:00    3.832447
2013-07-28 15:00:00    3.833550
2013-07-28 16:00:00    3.835784
2013-07-28 17:00:00    3.830142
2013-07-28 18:00:00    3.830099
2013-07-28 19:00:00    3.826079
2013-07-28 20:00:00    3.828526
2013-07-28 21:00:00    3.828529
2013-07-28 22:00:00    3.837030
2013-07-28 23:00:00    3.842391
2013-07-29 00:00:00    3.851401
2013-07-29 01:00:00    3.855317
2013-07-29 02:00:00    3.849119
2013-07-29 03:00:00    3.834403
2013-07-29 04:00:00    3.815718
2013-07-29 05:00:00    3.797565
2013-07-29 06:00:00    3.793011
2013-07-29 07:00:00    3.788586
2013-07-29 08:00:00    3.779395
2013-07-29 09:00:00    3.790199
2013-07-29 10:00:00    3.823780
2013-07-29 11:00:00    3.855194
2013-07-29 12:00:00    3.888087
2013-07-29 13:00:00    3.913691
2013-07-29 14:00:00    3.934493
2013-07-29 15:00:00    3.966245
2013-07-29 16:00:00    3.986093
Name: Count, Length: 8129, dtype: float64
In [114]:
expwighted_avg =  ts_log.ewm(halflife=24).mean()
plt.plot(train_log)
plt.plot(expwighted_avg, color='red')
Out[114]:
[<matplotlib.lines.Line2D at 0x116dc2e80>]
In [ ]:
train_log_moving_avg_diff = ts_log - moving_avg
In [118]:
#removing the trend of increasing
train_log_ewma_diff = ts_log - expwighted_avg
test_stationarity(train_log_ewma_diff)
Results of Dickey-Fuller Test:
Test Statistic                -1.712617e+01
p-value                        7.201133e-30
#Lags Used                     3.700000e+01
Number of Observations Used    8.091000e+03
Critical Value (1%)           -3.431158e+00
Critical Value (5%)           -2.861897e+00
Critical Value (10%)          -2.566960e+00
dtype: float64
In [133]:
ts_log_diff = ts_log - ts_log.shift()
test_stationarity(ts_log_diff.dropna())
Results of Dickey-Fuller Test:
Test Statistic                -1.820799e+01
p-value                        2.396667e-30
#Lags Used                     3.500000e+01
Number of Observations Used    8.092000e+03
Critical Value (1%)           -3.431158e+00
Critical Value (5%)           -2.861897e+00
Critical Value (10%)          -2.566960e+00
dtype: float64
In [134]:
ts_log_diff.head()
Out[134]:
Datetime
2012-08-25 00:00:00         NaN
2012-08-25 01:00:00   -1.386294
2012-08-25 02:00:00    1.098612
2012-08-25 03:00:00   -1.098612
2012-08-25 04:00:00    0.000000
Name: Count, dtype: float64
In [135]:
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(pd.DataFrame(ts_log).Count.values, freq = 24)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

plt.subplot(411)
plt.plot(train_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
In [136]:
#A closer look at the seasonality 
plt.plot(seasonal[:200],label='Sub-Seasonality')
plt.legend(loc='best')
Out[136]:
<matplotlib.legend.Legend at 0x116e5d518>
In [137]:
train_log_decompose = pd.DataFrame(residual)
train_log_decompose['date'] = ts_log.index
train_log_decompose.set_index('date', inplace = True)
train_log_decompose.dropna(inplace=True)
test_stationarity(train_log_decompose[0])
Results of Dickey-Fuller Test:
Test Statistic                  -23.008749
p-value                           0.000000
#Lags Used                       37.000000
Number of Observations Used    8067.000000
Critical Value (1%)              -3.431161
Critical Value (5%)              -2.861898
Critical Value (10%)             -2.566961
dtype: float64
In [138]:
#ACF and PACF plots:
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(ts_log_diff.dropna(), nlags=25)
lag_pacf = pacf(ts_log_diff.dropna(), nlags=25, method='ols')
#Plot ACF: 
plt.subplot(121) 
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
#Plot PACF:
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.tight_layout()
In [139]:
ts_log_diff.head()
Out[139]:
Datetime
2012-08-25 00:00:00         NaN
2012-08-25 01:00:00   -1.386294
2012-08-25 02:00:00    1.098612
2012-08-25 03:00:00   -1.098612
2012-08-25 04:00:00    0.000000
Name: Count, dtype: float64
In [140]:
from statsmodels.tsa.arima_model import ARIMA
In [141]:
model = ARIMA(ts_log, order=(1, 1, 0))  
results_AR = model.fit(disp=-1)  
plt.plot(train_log_diff.dropna())
plt.plot(results_AR.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-train_log_diff)**2))
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:191: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  start=index[0], end=index[-1], freq=freq)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/scipy/signal/signaltools.py:1364: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  out_full[ind] += zi
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/scipy/signal/signaltools.py:1367: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  out = out_full[ind]
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/scipy/signal/signaltools.py:1373: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  zf = out_full[ind]
Out[141]:
Text(0.5,1,'RSS: nan')
In [142]:
model = ARIMA(ts_log, order=(0, 1, 1))  
results_ARIMA = model.fit(disp=-1)  
plt.plot(train_log_diff.dropna())
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-train_log_diff)**2))
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:191: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  start=index[0], end=index[-1], freq=freq)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
Out[142]:
Text(0.5,1,'RSS: nan')
In [143]:
model = ARIMA(ts_log, order=(1, 1, 1))  
results_MA = model.fit(disp=-1)  
plt.plot(train_log_diff.dropna())
plt.plot(results_MA.fittedvalues, color='red')
plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-train_log_diff)**2))
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:191: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  start=index[0], end=index[-1], freq=freq)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
Out[143]:
Text(0.5,1,'RSS: nan')
In [157]:
#bring the differencing back to the original scale
def check_prediction_diff(predict_diff, given_set):
    predict_diff= predict_diff.cumsum().shift().fillna(0)
    predict_base = pd.Series(np.ones(given_set.shape[0]) * np.log(given_set['Count'])[0], index = given_set.index)
    predict_log = predict_base.add(predict_diff,fill_value=0)
    predict = np.exp(predict_log)
    
    plt.plot(given_set['Count'], label = "Given set")
    plt.plot(predict, color = 'red', label = "Predict")
    plt.legend(loc= 'best')
    plt.title('RMSE: %.4f'% (np.sqrt(np.dot(predict, given_set['Count']))/given_set.shape[0]))
    
In [160]:
def check_prediction_log(predict_log, given_set):
    predict = np.exp(predict_log)
    
    plt.plot(given_set['Count'], label = "Given set")
    plt.plot(predict, color = 'red', label = "Predict")
    plt.legend(loc= 'best')
    plt.title('RMSE: %.4f'% (np.sqrt(np.dot(predict, given_set['Count']))/given_set.shape[0]))
    
In [172]:
# Tried on MA model 
model = ARIMA(train_log.dropna(), order=(0, 1, 1))  
results_ARIMA = model.fit(disp=-1)  
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
print (predictions_ARIMA_diff.head())
Datetime
2012-08-25 01:00:00    0.000396
2012-08-25 02:00:00    0.535221
2012-08-25 03:00:00   -0.254866
2012-08-25 04:00:00    0.394723
2012-08-25 05:00:00    0.186109
dtype: float64
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:191: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  start=index[0], end=index[-1], freq=freq)
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:171: ValueWarning: No frequency information was provided, so inferred frequency H will be used.
  % freq, ValueWarning)
In [173]:
check_prediction_diff(predictions_ARIMA_diff, train)
In [174]:
def check_prediction(predict_diff, given_set):
    predict_diff= predict_diff.cumsum().shift().fillna(0)
    predict_base = pd.Series(np.ones(given_set.shape[0]) * np.log(given_set['Count'])[0], index = given_set.index)
    predict_log = predict_base.add(predict_diff,fill_value=0)
    predict = np.exp(predict_log)
    
    plt.plot(given_set['Count'], label = "Given set")
    plt.plot(predict, color = 'red', label = "Predict")
    plt.legend(loc= 'best')
    plt.title('RMSE: %.4f'% (np.sqrt(np.dot(predict, given_set['Count']))/given_set.shape[0]))
    
In [175]:
check_prediction(predictions_ARIMA_diff, train)
In [176]:
start = train.shape[0]
end = start + valid.shape[0]
valid_predict_diff = results_ARIMA.predict(start = start-1, end = end-2, typ = 'levels')
print (valid_predict_diff.head())
print (valid_predict_diff.tail())
2013-07-29 16:00:00    4.899254
2013-07-29 17:00:00    4.774995
2013-07-29 18:00:00    4.775390
2013-07-29 19:00:00    4.775786
2013-07-29 20:00:00    4.776182
Freq: H, dtype: float64
2013-11-19 09:00:00    5.845642
2013-11-19 10:00:00    5.846038
2013-11-19 11:00:00    5.846434
2013-11-19 12:00:00    5.846830
2013-11-19 13:00:00    5.847226
Freq: H, dtype: float64
/Users/venkateswarlusayana/anaconda3/lib/python3.6/site-packages/statsmodels/tsa/base/tsa_model.py:320: FutureWarning: Creating a DatetimeIndex by passing range endpoints is deprecated.  Use `pandas.date_range` instead.
  freq=base_index.freq)
In [177]:
check_prediction_log(valid_predict_diff, valid)